{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ff605f45-3fee-46ff-bbcc-96a899a7a891",
   "metadata": {},
   "source": [
    "# Select File Extensions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "94b16cbf-2182-420e-a6b2-d27465579d4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "pd.set_option(\"display.max_rows\", None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a84df2b-7b51-430f-93f3-01564a6e376b",
   "metadata": {},
   "source": [
    "Link to Gist: https://gist.github.com/ppisarczyk/43962d06686722d26d176fad46879d41"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ff488303-8a85-4692-8730-ef8cb01fce4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_json(\"https://gist.githubusercontent.com/ppisarczyk/43962d06686722d26d176fad46879d41/raw/211547723b4621a622fc56978d74aa416cbd1729/Programming_Languages_Extensions.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e7122b5-dc14-4b8b-98b1-fc45a1369ea4",
   "metadata": {},
   "source": [
    "There are 396 programming languages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "14508ba2-6543-406a-9949-d8b5a8a6be90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "396"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24bae138-2117-4f75-aff0-5be4a376b2b6",
   "metadata": {},
   "source": [
    "## Look at Extension Collisions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2e78273a-c76c-49ce-8d7b-db08bd3d54ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per extension, carrying the list of languages that claim it\n",
    "languages_per_ext = df.explode(\"extensions\").groupby(\"extensions\")[\"name\"].apply(list)\n",
    "df_coll = languages_per_ext.reset_index()\n",
    "df_coll[\"n\"] = df_coll[\"name\"].map(len)\n",
    "\n",
    "# Persist only the extensions that are shared by more than one language\n",
    "is_clash = df_coll[\"n\"] > 1\n",
    "df_coll[is_clash].to_json(\"extension_clashes.json\", orient=\"records\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "69ef9138-d1ee-4deb-a365-08b30a524dc4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_coll[df_coll[\"n\"]>1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d13df427-3648-4650-86a3-de1ce257a463",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>extensions</th>\n",
       "      <th>name</th>\n",
       "      <th>n</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>.asc</td>\n",
       "      <td>[AGS Script, AsciiDoc, Public Key]</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>.b</td>\n",
       "      <td>[Brainfuck, Limbo]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>77</th>\n",
       "      <td>.bb</td>\n",
       "      <td>[BitBake, BlitzBasic]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80</th>\n",
       "      <td>.bf</td>\n",
       "      <td>[Brainfuck, HyPhy]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>.brd</td>\n",
       "      <td>[Eagle, KiCad]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   extensions                                name  n\n",
       "49       .asc  [AGS Script, AsciiDoc, Public Key]  3\n",
       "72         .b                  [Brainfuck, Limbo]  2\n",
       "77        .bb               [BitBake, BlitzBasic]  2\n",
       "80        .bf                  [Brainfuck, HyPhy]  2\n",
       "87       .brd                      [Eagle, KiCad]  2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_coll[df_coll[\"n\"]>1].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "408185fa-429d-424c-89ab-6101f9d5f39d",
   "metadata": {},
   "source": [
    "## Resolve conflicts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f992dfce-b5ee-4f24-ac72-990afd9a47f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Languages whose rows are removed entirely (matched on the name column in the cleaning step).\n",
    "langs_to_drop = [\"HyPhy\"]\n",
    "\n",
    "# Extensions that are removed for every language (matched on the extensions column).\n",
    "exts_to_drop = [\".asc\", \".b\", \".cgi\", \".ch\", \".cls\", \".d\", \".es\", \".fcgi\",\n",
    "                \".fr\", \".gml\", \".gs\", \".inc\", \".l\", \".m\", \".mod\", \".moo\",\n",
    "                \".ms\", \".n\", \".ncl\", \".nl\", \".pluginspec\", \".pp\", \".pro\",\n",
    "                \".sc\", \".tst\", \".v\", '.sublime-build', '.sublime-commands', \n",
    "                '.sublime-completions', '.sublime-keymap', '.sublime-macro',\n",
    "                '.sublime-menu', '.sublime-mousemap', '.sublime-project',\n",
    "                '.sublime-settings', '.sublime-theme', '.sublime-workspace',\n",
    "                '.sublime_metrics', '.sublime_session']\n",
    "\n",
    "# (extension, language) pairs: for a listed extension, rows belonging to any\n",
    "# other language are flagged in the 'drop' column and removed afterwards.\n",
    "exts_to_lang_map = [\n",
    "    (\".bb\", \"BitBake\"),\n",
    "    (\".brd\", \"Eagle\"),\n",
    "    (\".cake\", \"C#\"),\n",
    "    (\".cl\", \"OpenCL\"),\n",
    "    (\".cp\", \"C++\"),\n",
    "    (\".cs\", \"C#\"),\n",
    "    (\".ecl\", \"ECL\"),\n",
    "    (\".f\", \"FORTRAN\"),\n",
    "    (\".for\", \"FORTRAN\"),\n",
    "    (\".frag\", \"GLSL\"),\n",
    "    (\".fs\", \"F#\"),\n",
    "    (\".fx\", \"Flux\"),\n",
    "    (\".g\", \"G-code\"),\n",
    "    (\".gd\", \"GDScript\"),\n",
    "    (\".h\", \"C\"),\n",
    "    (\".hh\", \"C++\"),\n",
    "    (\".j\", \"Jasmin\"),\n",
    "    (\".lisp\", \"Common Lisp\"),\n",
    "    (\".ls\", \"LiveScript\"),\n",
    "    (\".lsp\", \"Common Lisp\"),\n",
    "    (\".mm\", \"Objective-C++\"),\n",
    "    (\".nb\", \"Mathematica\"),\n",
    "    (\".php\", \"PHP\"),\n",
    "    (\".pm\", \"Perl\"),\n",
    "    (\".pl\", \"Perl\"),\n",
    "    (\".pod\", \"Pod\"),\n",
    "    (\".r\", \"R\"),\n",
    "    (\".rpy\", \"Ren'Py\"),\n",
    "    (\".rs\", \"Rust\"),\n",
    "    (\".sch\", \"Eagle\"),\n",
    "    (\".sls\", \"SaltStack\"),\n",
    "    (\".sql\", \"SQL\"),\n",
    "    (\".st\", \"Smalltalk\"),\n",
    "    (\".t\", \"Perl\"),\n",
    "    (\".ts\", \"TypeScript\"),\n",
    "    (\".tsx\", \"TypeScript\"),\n",
    "    (\".vhost\", \"ApacheConf\"),\n",
    "]\n",
    "\n",
    "# Language renames (old name -> new name) applied to the name column.\n",
    "langs_to_map = {\n",
    "    \"M4Sugar\": \"M4\",\n",
    "    \"PLSQL\": \"SQL\",\n",
    "    \"PLpgSQL\": \"SQL\",\n",
    "    \"SQLPL\": \"SQL\",\n",
    "}\n",
    "\n",
    "# Extra language -> extensions entries appended to the downloaded list.\n",
    "# 'Dockerfile' and 'Makefile' carry no leading dot; presumably they are meant\n",
    "# to match whole file names rather than suffixes - TODO confirm with consumers.\n",
    "langs_to_add = {\n",
    "    \"Maple\": [\".mpl\"],\n",
    "    \"Octave\": [\".oct\"],\n",
    "    \"WebAssembly\": [\".wat\"],\n",
    "    \"Solidity\": [\".sol\"],\n",
    "    \"Csound\": [\".csd\"],\n",
    "    \"Zig\": [\".zig\"],\n",
    "    \"C++\": ['.C', '.H'],\n",
    "    \"Dockerfile\": ['Dockerfile'],\n",
    "    \"Makefile\": ['Makefile'],\n",
    "    \n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b753aa4-8463-4f9c-b623-360164a7508e",
   "metadata": {},
   "source": [
    "## Clean list"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e991de37-0d5b-4fd8-ab01-25b34669feb1",
   "metadata": {},
   "source": [
    "Add languages that are not in original list:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b72d8dc3-6f17-4bd7-80b3-f49adc8afc7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a frame with the same columns as the downloaded data (name, extensions, type).\n",
    "# The dict views are materialised as lists so the constructor receives plain sequences.\n",
    "df_add = pd.DataFrame({\n",
    "    \"name\": list(langs_to_add.keys()),\n",
    "    \"extensions\": list(langs_to_add.values()),\n",
    "})\n",
    "df_add[\"type\"] = \"programming\"\n",
    "\n",
    "# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;\n",
    "# pd.concat yields the same stacked frame (original indices are kept).\n",
    "df = pd.concat([df, df_add])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c4da574-2ee9-4c5b-b04c-f604faa22f8a",
   "metadata": {},
   "source": [
    "Make each extension (~1000) a row:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a9c3e3e8-a5cd-4f00-937e-e3296bc2f6c7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1018"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ext = df.explode(\"extensions\")\n",
    "len(df_ext)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d1d5939-ab43-4e46-a446-f62bbeceb2eb",
   "metadata": {},
   "source": [
    "Remove languages without an extension:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5c7acadc-253d-48ec-9bc1-39e000977166",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>extensions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [name, type, extensions]\n",
       "Index: []"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ext = df_ext.dropna(subset=[\"extensions\"])\n",
    "df_ext[df_ext[\"extensions\"].isna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3913bb32-968a-420d-bd30-848725dfc373",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>extensions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ABAP</td>\n",
       "      <td>programming</td>\n",
       "      <td>.abap</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AGS Script</td>\n",
       "      <td>programming</td>\n",
       "      <td>.asc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AGS Script</td>\n",
       "      <td>programming</td>\n",
       "      <td>.ash</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AMPL</td>\n",
       "      <td>programming</td>\n",
       "      <td>.ampl</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AMPL</td>\n",
       "      <td>programming</td>\n",
       "      <td>.mod</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name         type extensions\n",
       "0        ABAP  programming      .abap\n",
       "1  AGS Script  programming       .asc\n",
       "1  AGS Script  programming       .ash\n",
       "2        AMPL  programming      .ampl\n",
       "2        AMPL  programming       .mod"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ext.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0d98dbc-af8d-4686-91d4-d2a834add1b2",
   "metadata": {},
   "source": [
    "1. Rename some languages\n",
    "2. Remove some extensions\n",
    "3. Remove some languages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f0f51dd5-51e4-419c-9c0c-67c40c64bdde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Rename languages first so the language filter below sees the new names\n",
    "renamed = df_ext.replace({\"name\": langs_to_map})\n",
    "\n",
    "# 2./3. Build both exclusion masks on the renamed frame and filter once\n",
    "has_dropped_ext = renamed[\"extensions\"].isin(exts_to_drop)\n",
    "has_dropped_lang = renamed[\"name\"].isin(langs_to_drop)\n",
    "df_ext = renamed[~(has_dropped_ext | has_dropped_lang)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0031bce6-787c-4fb3-aef2-b292b5867dec",
   "metadata": {},
   "source": [
    "For extensions listed in the mapping, keep only the row whose language matches the mapped language:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "28500dbd-9e56-40a8-833d-49fc4f679369",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extension -> the one language allowed to keep it.\n",
    "# Each extension appears once in exts_to_lang_map, so a dict lookup is equivalent\n",
    "# to scanning the list of pairs.\n",
    "ext_to_lang = dict(exts_to_lang_map)\n",
    "\n",
    "# Mapped language per row; NaN where the extension is not part of the mapping\n",
    "expected_lang = df_ext[\"extensions\"].map(ext_to_lang)\n",
    "\n",
    "# Flag rows whose extension is mapped but whose language differs from the mapped one.\n",
    "# assign() returns a new frame instead of writing a column into a filtered slice.\n",
    "df_ext = df_ext.assign(drop=expected_lang.notna() & (df_ext[\"name\"] != expected_lang))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "960acdfe-0854-45f0-bb3b-6f572dd2f84f",
   "metadata": {},
   "source": [
    "This flags 45 rows (extension/language pairs) to drop:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2932d731-bede-4903-933e-d7ffb7c1e79e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "45"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ext[\"drop\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3fa0b5ad-f982-4c16-8329-057c61e6536a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ext = df_ext[~df_ext[\"drop\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "396a2aae-f350-4edd-84a4-7410cd5c3fbc",
   "metadata": {},
   "source": [
    "After the language name mapping there can be duplicates:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "15b22a42-2c57-4a55-9b1a-0c4fa6e1217e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ext = df_ext.drop_duplicates(subset=[\"name\", \"extensions\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "559f4cde-8e54-4a77-8193-af7d1ddcbe60",
   "metadata": {},
   "source": [
    "Check if an extension appears more than once:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "1c4fd807-be83-45e9-99b8-454aa87576a1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('.abap', 1), ('.ash', 1), ('.ampl', 1), ('.g4', 1), ('.apib', 1)]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import Counter\n",
    "Counter(df_ext[\"extensions\"].to_list()).most_common(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d17dee9e-1ffc-4234-9549-1bf839c69feb",
   "metadata": {},
   "source": [
    "Number of extensions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "249be402-939a-44e9-964c-77e9f6f85461",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "870"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_ext)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f854f5c-ce36-4f1a-b41e-de06b64adee2",
   "metadata": {},
   "source": [
    "Merge all extensions such that there is one row per language:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "bf7eb9ae-57cd-4105-b146-092ec844c15e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result = df_ext.groupby(\"name\")['extensions'].apply(list).reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0cd97d5-3ae1-4a95-9c94-d58a7d88d9e3",
   "metadata": {},
   "source": [
    "There are 370 languages in total:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "af4ee69d-0cec-42e0-bce5-556dd5f6016e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "370"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "60fadf95-7e01-4782-99f3-39ef67cf1b58",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>extensions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ABAP</td>\n",
       "      <td>[.abap]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AGS Script</td>\n",
       "      <td>[.ash]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AMPL</td>\n",
       "      <td>[.ampl]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ANTLR</td>\n",
       "      <td>[.g4]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>API Blueprint</td>\n",
       "      <td>[.apib]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            name extensions\n",
       "0           ABAP    [.abap]\n",
       "1     AGS Script     [.ash]\n",
       "2           AMPL    [.ampl]\n",
       "3          ANTLR      [.g4]\n",
       "4  API Blueprint    [.apib]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_result.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a3c44c9-a140-450c-a882-6aa110edd548",
   "metadata": {},
   "source": [
    "## Save"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16bf27e1-01f4-4908-a2f6-1b94844dc6ef",
   "metadata": {},
   "source": [
    "Save list as JSON:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f325234c-5004-46c3-8737-11a0d5a0d63f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Language name -> list of extensions, in the row order of df_result\n",
    "result_dict = dict(zip(df_result[\"name\"], df_result[\"extensions\"]))\n",
    "\n",
    "with open(\"programming-languages-to-file-extensions.json\", \"w\") as json_file:\n",
    "    json.dump(result_dict, json_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3933b55e-6c08-4527-bdff-a02852b6e7a4",
   "metadata": {},
   "source": [
    "## Compare list to v1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "a0fae6f4-b12f-4005-ac20-fcd1b52e95c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "langs_30 = {\n",
    "    \"Assembly\": [\".asm\"],\n",
    "    \"Batchfile\": [\".bat\", \".cmd\"],\n",
    "    \"C\": [\".c\", \".h\"],\n",
    "    \"C#\": [\".cs\"],\n",
    "    \"C++\": [\".cpp\", \".hpp\", \".c++\", \".h++\", \".cc\", \".hh\", \".C\", \".H\"],\n",
    "    \"CMake\": [\".cmake\"],\n",
    "    \"CSS\": [\".css\"],\n",
    "    \"Dockerfile\": [\".dockerfile\", \"Dockerfile\"],\n",
    "    \"FORTRAN\": ['.f90', '.f', '.f03', '.f08', '.f77', '.f95', '.for', '.fpp'],\n",
    "    \"Go\": [\".go\"],\n",
    "    \"Haskell\": [\".hs\"],\n",
    "    \"HTML\":[\".html\"],\n",
    "    \"Java\": [\".java\"],\n",
    "    \"JavaScript\": [\".js\"],\n",
    "    \"Julia\": [\".jl\"],\n",
    "    \"Lua\": [\".lua\"],\n",
    "    \"Makefile\": [\"Makefile\"],\n",
    "    \"Markdown\": [\".md\", \".markdown\"],\n",
    "    \"PHP\": [\".php\", \".php3\", \".php4\", \".php5\", \".phps\", \".phpt\"],\n",
    "    \"Perl\": [\".pl\", \".pm\", \".pod\", \".perl\"],\n",
    "    \"PowerShell\": ['.ps1', '.psd1', '.psm1'],\n",
    "    \"Python\": [\".py\"],\n",
    "    \"Ruby\": [\".rb\"],\n",
    "    \"Rust\": [\".rs\"],\n",
    "    \"SQL\": [\".sql\"],\n",
    "    \"Scala\": [\".scala\"],\n",
    "    \"Shell\": [\".sh\", \".bash\", \".command\", \".zsh\"],\n",
    "    \"TypeScript\": [\".ts\", \".tsx\"],\n",
    "    \"TeX\": [\".tex\"],\n",
    "    \"Visual Basic\": [\".vb\"]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "900e33e8-6e08-446d-982b-c2c543a4580c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Assembly ['.a51', '.nasm']\n",
      "Batchfile []\n",
      "C ['.cats', '.idc', '.w']\n",
      "C# ['.cake', '.cshtml', '.csx']\n",
      "C++ ['.cp', '.cxx', '.hxx', '.inl', '.ipp', '.tcc', '.tpp']\n",
      "CMake ['.cmake.in']\n",
      "CSS []\n",
      "Dockerfile []\n",
      "FORTRAN []\n",
      "Go []\n",
      "Haskell ['.hsc']\n",
      "HTML ['.htm', '.html.hl', '.xht', '.xhtml']\n",
      "Java []\n",
      "JavaScript ['._js', '.bones', '.es6', '.jake', '.jsb', '.jscad', '.jsfl', '.jsm', '.jss', '.njs', '.pac', '.sjs', '.ssjs', '.xsjs', '.xsjslib']\n",
      "Julia []\n",
      "Lua ['.nse', '.pd_lua', '.rbxs', '.wlua']\n",
      "Makefile ['.mak', '.mk', '.mkfile']\n",
      "Markdown ['.mkd', '.mkdn', '.mkdown', '.ron']\n",
      "PHP ['.aw', '.ctp']\n",
      "Perl ['.al', '.ph', '.plx', '.psgi', '.t']\n",
      "PowerShell []\n",
      "Python ['.bzl', '.gyp', '.lmi', '.pyde', '.pyp', '.pyt', '.pyw', '.tac', '.wsgi', '.xpy']\n",
      "Ruby ['.builder', '.gemspec', '.god', '.irbrc', '.jbuilder', '.mspec', '.podspec', '.rabl', '.rake', '.rbuild', '.rbw', '.rbx', '.ru', '.ruby', '.thor', '.watchr']\n",
      "Rust ['.rs.in']\n",
      "SQL ['.pls', '.pck', '.pkb', '.pks', '.plb', '.plsql', '.cql', '.ddl', '.prc', '.tab', '.udf', '.viw', '.db2']\n",
      "Scala ['.sbt']\n",
      "Shell ['.bats', '.ksh', '.sh.in', '.tmux', '.tool']\n",
      "TypeScript []\n",
      "TeX ['.aux', '.bbx', '.bib', '.cbx', '.dtx', '.ins', '.lbx', '.ltx', '.mkii', '.mkiv', '.mkvi', '.sty', '.toc']\n",
      "Visual Basic ['.bas', '.frm', '.frx', '.vba', '.vbhtml', '.vbs']\n"
     ]
    }
   ],
   "source": [
    "for key in langs_30:\n",
    "    print(key, [r for r in result_dict[key] if r not in langs_30[key]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8862273e-2bd4-4e7e-87b6-e534449b8495",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Assembly []\n",
      "Batchfile []\n",
      "C []\n",
      "C# []\n",
      "C++ []\n",
      "CMake []\n",
      "CSS []\n",
      "Dockerfile []\n",
      "FORTRAN []\n",
      "Go []\n",
      "Haskell []\n",
      "HTML []\n",
      "Java []\n",
      "JavaScript []\n",
      "Julia []\n",
      "Lua []\n",
      "Makefile []\n",
      "Markdown []\n",
      "PHP []\n",
      "Perl ['.pod']\n",
      "PowerShell []\n",
      "Python []\n",
      "Ruby []\n",
      "Rust []\n",
      "SQL []\n",
      "Scala []\n",
      "Shell []\n",
      "TypeScript []\n",
      "TeX []\n",
      "Visual Basic []\n"
     ]
    }
   ],
   "source": [
    "for key in langs_30:\n",
    "    print(key, [r for r in langs_30[key] if r not in result_dict[key]])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
